import os
import subprocess
import warnings

import numpy as np
import pandas as pd
#import plotly
#import plotly.express as px

from pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
warnings.filterwarnings('ignore')
%matplotlib inline
The user must provide the path to the input file in .mfasta format and the path to the output directory used to store intermediate files:
# Root of the local PoincareMSA checkout. Every other path in this cell is
# derived from it, so this is the only line to edit when the checkout moves.
path_to_PM = "/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA"
mfasta = f"{path_to_PM}/examples/kinases/kinases.mfasta"  # full path to the input MSA in mfasta format
path_out = f"{path_to_PM}/examples/kinases_test"  # directory to write resulting files
out_name = "kinases"  # name given to the output files
# Derived from path_to_PM instead of repeating the absolute path, so the two
# cannot drift apart; exported figures are saved in this directory.
path_to_figures = f"{path_to_PM}/figures"
All scripts necessary for data preparation are located in scripts/prepare_data:
# Directory holding the data-preparation scripts; create_projection.sh is
# launched from here in the next step.
path_prep_scripts = f"{path_to_PM}/scripts/prepare_data"
Data preparation consists of cleaning the .mfasta alignment according to a gap threshold and translating each sequence into a PSSM profile:
gapth = "0.9"  # threshold for filtering gapped positions

# Arguments handed to create_projection.sh, in the same order as before.
prep_args = [path_prep_scripts, mfasta, path_out, out_name, gapth]
# Space-joined form kept under its original name for any cell that still reads it.
prep_parameters = " ".join(prep_args)

# The script is started from an argument list (no shell), so paths containing
# spaces or shell metacharacters reach it intact. check=True raises
# CalledProcessError on a non-zero exit status, so the success message below
# is no longer printed after a failed preparation.
# NOTE(review): running without a shell requires create_projection.sh to be
# executable and to start with a shebang line - confirm.
subprocess.run([f"{path_prep_scripts}/create_projection.sh", *prep_args], check=True)
print("Output files ready for projection are written to: "+path_out+"/fasta"+gapth)
Output files ready for projection are written to: /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/kinases_test/fasta0.9
You can change the parameters of the projection here:
# Projection parameters. They are kept as strings because they are
# concatenated into a command line and into the output file name below.
# NOTE(review): in the visible part of this notebook only gamma, cospca,
# epochs and seed are forwarded to main.py - confirm that knn, sigma and
# batchs are meant to stay at whatever main.py uses when they are not given.
gamma = "2.00"   # passed to main.py as --gamma; also part of the output file name
cospca = "0"     # passed to main.py as --pca; also part of the output file name
epochs = "1000"  # passed to main.py as --epochs
seed = "0"       # passed to main.py as --seed; also part of the output file name
knn = "5"        # only used to rebuild the output file name
sigma = "1.00"   # only used to rebuild the output file name
batchs = "4"     # not referenced by the command or the file name built below
Then, the following command creates a projection of encoded sequences to a Poincaré disk:
path_to_build_PM = f"{path_to_PM}/scripts/build_poincare_map"

# The command is built as an argument list and executed without a shell, so
# paths containing spaces or shell metacharacters are passed through intact.
# NOTE(review): knn, sigma and batchs are not forwarded here, although knn and
# sigma are used to rebuild the expected output file name afterwards - confirm
# whether main.py should receive them as well.
pm_args = [
    "python", f"{path_to_build_PM}/main.py",
    "--input_path", f"{path_out}/fasta{gapth}",
    "--output_path", f"{path_out}/projections/",
    "--gamma", gamma,
    "--pca", cospca,
    "--epochs", epochs,
    "--seed", seed,
]
# Human-readable form of the command, kept under its original name.
pm_command = " ".join(pm_args)
print(pm_command)
# check=True raises CalledProcessError when main.py fails, so later cells do
# not go on to read a missing or stale projection. The trailing expression
# keeps the exit status as the cell output.
subprocess.run(pm_args, check=True).returncode
python /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/build_poincare_map/main.py --input_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/kinases_test/fasta0.9 --output_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/kinases_test/projections/ --gamma 2.00 --pca 0 --epochs 1000 --seed 0
0
Default parameter values are provided in the comments. The resulting projection is then written to the following file:
# Expected location of the projection CSV: the file name is rebuilt from the
# parameter strings defined earlier in the notebook.
path_embedding = (
    f"{path_out}/projections/"
    f"PM{knn}sigma={sigma}gamma={gamma}cosinepca={cospca}_seed{seed}.csv"
)
print(path_embedding)
/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/kinases_test/projections/PM5sigma=1.00gamma=2.00cosinepca=0_seed0.csv
One can visualize the resulting projection using any convenient coloring. To do so, the user should provide a .csv file in which each line corresponds to a protein:
# Annotation table used for coloring; the first CSV column becomes the index.
path_annotation = f"{path_to_PM}/visualization/data/kinase_group_new.csv"  # path to annotation file
kinase_df = pd.read_csv(path_annotation, index_col=0)
kinase_df  # last expression of the cell, so the table is displayed
| 1_Group | 2_Gene | 3_HGNC | 4_Uni_entry | 5_Uni_acc | 6_Domain_begin | 7_Domain_end | 8_Domain_length | 9_Largest_insert_length | 10_PDB_validation | 11_Conformational_state | 12_Dihedral_state | 13_Group_in_Uni | 14_Group_in_Manning | 15_Synonymn | evo_distance | decile_domain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| proteins_id | |||||||||||||||||
| 1 | AGC | AKT1 | HGNC:391 | AKT1_HUMAN | P31749 | 150 | 408 | 259 | 9 | 6NPZB | DFGin | BLAminus | AGC | AGC | PKB,RAC | 1.208266 | 2 |
| 2 | AGC | AKT2 | HGNC:392 | AKT2_HUMAN | P31751 | 152 | 409 | 258 | 9 | 3E8DB | DFGin | BLAminus | AGC | AGC | NaN | 1.208312 | 2 |
| 3 | AGC | AKT3 | HGNC:393 | AKT3_HUMAN | Q9Y243 | 148 | 405 | 258 | 9 | NaN | NaN | NaN | AGC | AGC | PKBG | 1.183463 | 2 |
| 4 | AGC | CDC42BPA | HGNC:1737 | MRCKA_HUMAN | Q5VT25 | 77 | 343 | 267 | 14 | NaN | NaN | NaN | AGC | AGC | KIAA0451 | 1.399119 | 5 |
| 5 | AGC | CDC42BPB | HGNC:1738 | MRCKB_HUMAN | Q9Y5S2 | 76 | 342 | 267 | 14 | 5OTFA | DFGin | BLAminus | AGC | AGC | KIAA1124 | 1.406213 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 493 | TYR | TYK2_1 | HGNC:12440 | TYK2_HUMAN | P29597 | 589 | 868 | 280 | 26 | 3ZONA | DFGin | NaN | TYR | TYR | NaN | 2.924877 | 7 |
| 494 | TYR | TYK2_2 | HGNC:12440 | TYK2_HUMAN | P29597 | 897 | 1171 | 275 | 21 | 4GVJA | DFGin | BLAminus | TYR | TYR | NaN | 2.031230 | 6 |
| 495 | TYR | TYRO3 | HGNC:12446 | TYRO3_HUMAN | Q06418 | 518 | 788 | 271 | 10 | NaN | NaN | NaN | TYR | TYR | BYK,DTK,RSE,SKY,TIF | 2.428364 | 6 |
| 496 | TYR | YES1 | HGNC:12841 | YES_HUMAN | P07947 | 277 | 528 | 252 | 10 | NaN | NaN | NaN | TYR | TYR | YES | 2.450070 | 0 |
| 497 | TYR | ZAP70 | HGNC:12858 | ZAP70_HUMAN | P43403 | 337 | 595 | 259 | 10 | 1U59A | DFGin | BLAminus | TYR | TYR | SRK | 2.194576 | 2 |
497 rows × 17 columns
A user can also create a custom color palette:
# Custom palette mapping a label to a hex color.
# Placeholder / unassigned labels all share the same grey.
kinase_palette = dict.fromkeys(
    [-1, "OTHER", "None", "NA", "Uncharacterized"], "#c7c7c7"
)
kinase_palette.update({
    "root": "#000000",
    # kinase groups
    "TYR": "#bd065f",
    "CMGC": "#d5c203",
    "TKL": "#997e73",
    "STE": "#80b412",
    "CK1": "#0dbae9",
    "AGC": "#00bba1",
    "CAMK": "#1f6ed4",
    "NEK": "#8ce4fa",
    "RGC": "#f59a62",
})
# Load the projection together with the annotation table. The printed result
# shows the pm1/pm2 coordinates next to the annotation columns.
df = read_embeddings(
    path_embedding,
    path_annotation,
    withroot=False,
)
result: pm1 pm2 1_Group 2_Gene 3_HGNC 4_Uni_entry \
proteins_id
1 -0.707390 -0.592394 AGC AKT1 HGNC:391 AKT1_HUMAN
2 -0.716039 -0.569608 AGC AKT2 HGNC:392 AKT2_HUMAN
3 -0.694588 -0.567908 AGC AKT3 HGNC:393 AKT3_HUMAN
4 -0.397254 -0.890340 AGC CDC42BPA HGNC:1737 MRCKA_HUMAN
5 -0.393140 -0.887047 AGC CDC42BPB HGNC:1738 MRCKB_HUMAN
... ... ... ... ... ... ...
493 0.499884 0.780072 TYR TYK2_1 HGNC:12440 TYK2_HUMAN
494 0.218304 0.858630 TYR TYK2_2 HGNC:12440 TYK2_HUMAN
495 0.305759 0.830769 TYR TYRO3 HGNC:12446 TYRO3_HUMAN
496 0.402127 0.612690 TYR YES1 HGNC:12841 YES_HUMAN
497 0.469407 0.822037 TYR ZAP70 HGNC:12858 ZAP70_HUMAN
5_Uni_acc 6_Domain_begin 7_Domain_end 8_Domain_length \
proteins_id
1 P31749 150 408 259
2 P31751 152 409 258
3 Q9Y243 148 405 258
4 Q5VT25 77 343 267
5 Q9Y5S2 76 342 267
... ... ... ... ...
493 P29597 589 868 280
494 P29597 897 1171 275
495 Q06418 518 788 271
496 P07947 277 528 252
497 P43403 337 595 259
9_Largest_insert_length 10_PDB_validation \
proteins_id
1 9 6NPZB
2 9 3E8DB
3 9 NaN
4 14 NaN
5 14 5OTFA
... ... ...
493 26 3ZONA
494 21 4GVJA
495 10 NaN
496 10 NaN
497 10 1U59A
11_Conformational_state 12_Dihedral_state 13_Group_in_Uni \
proteins_id
1 DFGin BLAminus AGC
2 DFGin BLAminus AGC
3 NaN NaN AGC
4 NaN NaN AGC
5 DFGin BLAminus AGC
... ... ... ...
493 DFGin NaN TYR
494 DFGin BLAminus TYR
495 NaN NaN TYR
496 NaN NaN TYR
497 DFGin BLAminus TYR
14_Group_in_Manning 15_Synonymn evo_distance \
proteins_id
1 AGC PKB,RAC 1.208266
2 AGC NaN 1.208312
3 AGC PKBG 1.183463
4 AGC KIAA0451 1.399119
5 AGC KIAA1124 1.406213
... ... ... ...
493 TYR NaN 2.924877
494 TYR NaN 2.031230
495 TYR BYK,DTK,RSE,SKY,TIF 2.428364
496 TYR YES 2.450070
497 TYR SRK 2.194576
decile_domain
proteins_id
1 2
2 2
3 2
4 5
5 5
... ...
493 7
494 6
495 6
496 0
497 2
[497 rows x 19 columns]
Here follow several examples of kinase family visualization.
# Interactive projection colored by the '1_Group' annotation column.
trace1 = plot_embedding_interactive(
    df,
    labels_name='1_Group',  # any other annotation column, e.g. '2_Gene', can be used
    show_text=True,
    color_palette=kinase_palette,
    title="Poincaré Map projection colored by kinase groups",  # fixed typo "Poinicaré"
    fontsize=10,
)
# Make sure the export directory exists before writing the figure to it.
os.makedirs(path_to_figures, exist_ok=True)
trace1.write_image(path_to_figures + "/Kinases_by_group.pdf")
trace1.show()
Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
'6_Domain_begin', '7_Domain_end', '8_Domain_length',
'9_Largest_insert_length', '10_PDB_validation',
'11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
'14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
dtype='object')
You can highlight several points using the plotly interface. Provide the corresponding column name in second_labels_name and the list of labels to show in labels_text:
# Same coloring as before, with a few points highlighted: second_labels_name
# names the column to look in and labels_text lists the labels to show.
trace2 = plot_embedding_interactive(
    df,
    labels_name='1_Group',  # any other annotation column, e.g. '2_Gene', can be used
    show_text=True,
    color_palette=kinase_palette,
    title="Poincaré Map projection colored by kinase groups",  # fixed typo "Poinicaré"
    fontsize=10,
    second_labels_name="4_Uni_entry",
    labels_text=["CLK3_HUMAN", "SRPK3_HUMAN", "HIPK1_HUMAN", "CSK22_HUMAN"],  # some CMGC kinases
    # Alternative selection on the same column:
    #   labels_text=["ST17B_HUMAN", "MYLK2_HUMAN", "KALRN_HUMAN"]
    # Alternative selection by gene name (some sequences of the AGC first
    # domain and CAMK second domain mentioned in the article):
    #   second_labels_name="2_Gene",
    #   labels_text=["RPS6KA1_1", "RPS6KA2_1", "RPS6KA5_1", "RPS6KB2", "RPS6KA1_2", "RPS6KA2_2", "RPS6KA3_2", "RPS6KA5_2"]
)
trace2.show()
Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
'6_Domain_begin', '7_Domain_end', '8_Domain_length',
'9_Largest_insert_length', '10_PDB_validation',
'11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
'14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
dtype='object')
You can also compare different colorings by assigning the name of the desired column to the labels_name variable. Here we compare the kinase group coloring proposed in the article to the kinase groups given in UniProt:
# Projection colored by the '13_Group_in_Uni' annotation column.
trace3 = plot_embedding_interactive(
    df,
    labels_name='13_Group_in_Uni',
    show_text=True,
    color_palette=kinase_palette,
    # The title is built from the notebook's parameter variables. The previous
    # hard-coded text said "gamma 3" although gamma is set to "2.00" above.
    title=(
        "PM projection on kinases by kinase groups in Uniprot - "
        f"KNN {knn} gamma {gamma} batchsize {batchs} epochs {epochs}"
    ),
    fontsize=10,
)
# Make sure the export directory exists before writing the figure to it.
os.makedirs(path_to_figures, exist_ok=True)
trace3.write_image(path_to_figures + "/Kinases_by_UniGroup.pdf")
trace3.show()
Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
'6_Domain_begin', '7_Domain_end', '8_Domain_length',
'9_Largest_insert_length', '10_PDB_validation',
'11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
'14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
dtype='object')
...and to the classification provided in the study by Manning et al.:
# Projection colored by the '14_Group_in_Manning' annotation column.
trace4 = plot_embedding_interactive(
    df,
    labels_name='14_Group_in_Manning',
    show_text=True,
    color_palette=kinase_palette,
    second_labels_name="2_Gene",
    title="Poincaré Maps projection by kinases groups according to Manning",
    fontsize=10,
    # Optional gene selections to highlight (uncomment one):
    #   STE kinases found within TKL:
    #   labels_text=["MAP3K7", "MAP3K9", "MAP3K10", "MAP3K11", "MAP3K12", "MAP3K13", "MAP3K20", "MAP3K21"]
    #   kinases that could be CAMK:
    #   labels_text=["AURKA", "AURKB", "AURKC", "CAMKK1", "CAMKK2", "PLK1", "PLK2", "PLK3", "PLK4"]
)
# Make sure the export directory exists before writing the figure to it.
os.makedirs(path_to_figures, exist_ok=True)
trace4.write_image(path_to_figures + "/Kinases_by_ManningGroup.pdf")
trace4.show()
Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
'6_Domain_begin', '7_Domain_end', '8_Domain_length',
'9_Largest_insert_length', '10_PDB_validation',
'11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
'14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
dtype='object')